This document contains all the analyses done on ONT DNA data that was generated for the tailfindr paper. Knit this R markdown file after you have successfully run drake::r_make().

Load the required libraries first:

pacman::p_load(dplyr, magrittr, ggplot2, drake, 
               knitr, ggpubr, here, tidyverse)

Now load the data:

loadd(dna_kr_data)
# make a version of data in which tail lengths are capped to 300
dna_kr_data_capped <- dna_kr_data %>% 
  mutate(tail_length_ff = ifelse(tail_length_ff > 300, 300, tail_length_ff)) %>% 
  mutate(tail_length_st = ifelse(tail_length_st > 300, 300, tail_length_st))

1 About the data

Here is a description of columns of dna_kr_data:

knitr::kable(col_names_df)
Columns Description
read_id Read ID
tail_start_ff tailfindr estimate of poly(A) start site based on flipflop basecalling
tail_end_ff tailfindr estimate of poly(A) end site based on flipflop basecalling
samples_per_nt_ff tailfindr estimate of read-specific translocation rate in units of samples per nucleotide based on flipflop basecalling
tail_length_ff tailfindr estimate of poly(A) tail length based on flipflop basecalling
tail_start_st tailfindr estimate of poly(A) start site from standard model basecalling
tail_end_st tailfindr estimate of poly(A) end site from standard model basecalling
samples_per_nt_st tailfindr estimate of read-specific translocation rate in units of samples per nucleotide from standard model basecalling
tail_length_st tailfindr estimate of poly(A) tail length from standard model basecalling
read_type Whether the read is poly(A) or poly(T) read
barcode Expected poly(A)/(T) tail length from spikeins
replicate Replicate No
file_path Full file path (relevant only for use within Valen lab)
transcript_alignment_start_st Location of tail end by eGFP sequence alignment (standard model basecalling)
transcript_alignment_start_ff Location of tail end by eGFP sequence alignment (flipflop model basecalling)
moves_in_tail_st Moves between the tail boundaries in data basecalled using standard model
moves_in_tail_ff Moves between the tail boundaries in data basecalled using flipflop model
kit Sequencing kit used

1.1 Data summary

# define the function for computing standard error 
std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x))

# summarize the data and display a table
summary_data <- dna_kr_data %>% group_by(barcode, read_type) %>% 
  summarise(read_count = n(),
            mean = mean(tail_length_st, na.rm = TRUE),
            median = median(tail_length_st, na.rm = TRUE),
            std_dev = sd(tail_length_st, na.rm = TRUE),
            std_err = std_err(tail_length_st)) 
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)
barcode read_type read_count mean median std_dev std_err cof_var
10 polyA 5462 21.26826 13.060 25.56187 0.3458730 1.2018789
10 polyT 11072 16.22762 12.120 19.80606 0.1882284 1.2205155
30 polyA 13063 34.44375 31.210 16.90389 0.1478990 0.4907680
30 polyT 17087 31.65191 29.980 15.14715 0.1158772 0.4785540
40 polyA 6946 42.09980 40.290 17.44976 0.2093737 0.4144857
40 polyT 13811 39.02614 39.480 16.64152 0.1416056 0.4264200
60 polyA 8261 57.68607 59.140 18.89761 0.2079173 0.3275940
60 polyT 10072 53.26727 59.110 24.55896 0.2447103 0.4610517
100 polyA 3015 93.58427 96.820 23.11061 0.4208891 0.2469497
100 polyT 3166 91.69956 101.175 34.41122 0.6115678 0.3752604
150 polyA 1767 126.09282 138.460 41.75618 0.9933504 0.3311543
150 polyT 2535 119.84817 130.150 50.31467 0.9993224 0.4198201

2 tailfindr tail length estimation across replicates

To find out how robust the tail length estimated by tailfindr is across technical replicates:

p <- ggplot(dna_kr_data_capped, aes(x = barcode, y = tail_length_st, 
                                    color = replicate, fill = replicate)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode)))) 

3 tailfindr tail length comparison between poly(A) and poly(T) read types

To address whether estimated tail lengths of poly(A) and poly(T) reads are comparable:

p <- ggplot(dna_kr_data_capped, aes(x = barcode, y = tail_length_st, 
                                    color = read_type, fill = read_type)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode))))

4 tailfindr tail length comparison between flipflop and standard basecalling

To test whether basecalling strategy (standard model vs flip-flop model basecalling) has an influence on poly(A) length estimation:

# make data first
long_dna_data <- dna_kr_data %>% select(barcode, tail_length_ff, tail_length_st) %>% 
  gather(key = 'basecaller', value = 'tail_length', tail_length_ff, tail_length_st)

p <- ggplot(long_dna_data, aes(x = barcode, y = tail_length, 
                             color = basecaller, fill = basecaller)) +
  geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode))))

4.1 tail length flipflop vs standard basecalling (Poly(A) reads)

p <- ggplot(data = filter(dna_kr_data, read_type == 'polyA'), 
            aes(x = tail_length_st, y = tail_length_ff)) +
  geom_point(alpha = 0.02) +
  geom_abline(intercept = 0, slope = 1, color="red", 
                linetype = 'dotted', size = 0.7) +
  geom_smooth(method = 'lm',formula = y~x, color="#797979", 
                fullrange = TRUE, se = FALSE, size = 0.5) +
  stat_cor(method = "pearson", label.x = 10, label.y = 250) 

4.2 tail length flipflop vs standard basecalling (Poly(T) reads)

p <- ggplot(data = filter(dna_kr_data, read_type == 'polyT'), 
            aes(x = tail_length_st, y = tail_length_ff)) +
  geom_point(alpha = 0.02) +
  geom_abline(intercept = 0, slope = 1, color="red", 
                linetype = 'dotted', size = 0.7) +
  geom_smooth(method = 'lm',formula = y~x, color="#797979", 
                fullrange = TRUE, se = FALSE, size = 0.5) +
  stat_cor(method = "pearson", label.x = 10, label.y = 250) 

5 tailfindr tail end estimate vs. tail end obtained by alignment of eGFP

First a new column transcript_end_tfis produced in our dataset. This column holds the tailfindr boundary location which is adjacent to transcript:

dna_kr_data %<>% 
  mutate(transcript_end_tf = ifelse(read_type == 'polyA', 
                                    tail_start_ff, tail_end_ff))

To visualize whether the coordinates of the tail end estimated by tailfindr match up with those obtained from the alignment with eGFP sequence:

p <- ggplot(dna_kr_data, aes(x = transcript_end_tf, y = transcript_alignment_start_ff)) +
    geom_point(shape = 21, colour = 'black', fill = 'black', 
               size = 2, stroke=0, alpha = 0.1) +
    geom_abline(intercept = 0, slope = 1, color="red", 
                linetype = 'dotted', size = 0.7) +
    geom_smooth(method = 'lm',formula = y~x, color="#797979", 
                fullrange = TRUE, se = FALSE, size = 0.5) +
    stat_cor(method = "pearson", label.x = 1000, label.y = 25000) +
    coord_cartesian(xlim = c(0, 30000), ylim = c(0, 30000)) +
  facet_grid(read_type~.)

To better visualize the difference, the same information is plotted as histogram:

hist_data <- mutate(dna_kr_data, diff = transcript_end_tf - transcript_alignment_start_ff) 
p <- ggplot(hist_data, aes(x = diff)) +
  geom_histogram(binwidth = 1, fill = "grey50", size = 0, alpha = 0.6) +
  facet_grid(read_type~.) 

6 Analysis of spurious peak around 12 in the density plots

First, classify the reads into erroneous and non-erroneous read

# Erroneous reads have tail length between 9 and 15
spurious_peak_data <- dna_kr_data %>% 
  filter(barcode == 60 | barcode == 100 | barcode == 150) %>% 
  mutate(read_classification = 
           if_else(tail_length_st >= 9 & tail_length_st <= 15, 
                   'erroneous', 
                   'non-erroneous')) 

6.1 Read rate density from erroneous vs non-erroneous reads

p <- ggplot(spurious_peak_data, aes(x = samples_per_nt_st, 
                                    color = read_classification)) +
  geom_line(stat = 'density')

6.2 Raw tail length vs. normalizer for erroneous and non-erroneous reads

spurious_peak_data %<>% 
  mutate(tail_length_in_samples_st = tail_end_st - tail_start_st) %>% 
  mutate(barcode = as.character(barcode)) %>% 
  mutate(barcode = ifelse(read_classification == 'erroneous', 'erroneous', barcode)) %>% 
  mutate(barcode = fct_relevel(barcode, "erroneous", "60", "100", "150"))

p <- ggplot(spurious_peak_data, aes(x = tail_length_in_samples_st,
                                    y = samples_per_nt_st,
                                    color = barcode)) +
  geom_point(alpha = 0.5, stroke = 0, size = 2)

6.3 eGFP alignment end vs tailfindr end on erroneoush vs non-erroneous reads

spurious_peak_data %<>%
  mutate(transcript_end_st = ifelse(read_type == 'polyA',
                                    tail_start_st,
                                    tail_end_st)) %>%
  mutate(diff = transcript_end_st - transcript_alignment_start_st) %>%
  mutate(
    read_classification =
      case_when(
        read_type == 'polyT' &
          read_classification == 'erroneous' ~ "erroneous_polyt",
        read_type == 'polyT' &
          read_classification == 'non-erroneous' ~ "non-erroneous_polyt",
        read_type == 'polyA' &
          read_classification == 'erroneous' ~ "erroneous_polya",
        read_type == 'polyA' &
          read_classification == 'non-erroneous' ~ "non-erroneous_polya"
      )
  ) %>% 
  mutate(read_classification = fct_relevel(read_classification,
                                           "erroneous_polyt",
                                           "non-erroneous_polyt",
                                           "erroneous_polya", 
                                           "non-erroneous_polya"))

p <- ggplot(spurious_peak_data, aes(x = diff,
                                    color = read_classification)) +
  geom_line(stat = 'density', size = 0.9, position = 'identity') 

7 Justification for DNA thresholds

To define poly(A)/(T) sections in DNA sequencing, our algorithm thresholds normalised raw signal data. Threshold values are 0.3 for poly(T) reads and 0.6 for poly(A) reads. Based on provided model data used for basecalling, the expected mean for both these homopolymeric nucleotide stretches should be well below that threshold after normalisation (see below). Since the signal value can vary, we aimed to set a robust threshold. We thus decided to set the threshold in a way to contain all expected signal value within 2 standard deviations from the mean. As shown below, the expected signal value 2 standard deviations from the expected mean is still contained within the chosen thresholds.

model_180mv <- here('data', 'r9.4_180mv_450bps_6mer_template_median68pA.model')
df_180mv <- read.csv(model_180mv, header = TRUE, 
                     stringsAsFactors = FALSE, sep = '\t') 

df_180mv <- as.tibble(df_180mv)

# sample 1000 numbers from a Gaussian distribution for each kmer mu and sd
set.seed(5)
df_180mv %<>% select(kmer, level_mean, level_stdv) %>% 
  rowwise() %>% 
  mutate(sampled_numbers = list(rnorm(n=1000, level_mean, level_stdv)))

sampled_numbers <- purrr::flatten(df_180mv$sampled_numbers)
sampled_numbers <- unlist(sampled_numbers, recursive = FALSE)

# Calculate mean and sd of AAAAA state, and all the kmers combined
mean_AAAAAA <- mean(unlist(df_180mv$sampled_numbers[1])) 
sd_AAAAAA <- sd(unlist(df_180mv$sampled_numbers[1])) 
mean_TTTTTT <- mean(unlist(df_180mv$sampled_numbers[4096])) 
sd_TTTTTT <- sd(unlist(df_180mv$sampled_numbers[4096])) 
mean_all_kmers <- mean(sampled_numbers) 
sd_all_kmers <- sd(sampled_numbers)

maximum_expected_normalized_AAAAAA_level  <- 
  ((mean_AAAAAA - 2 * sd_AAAAAA) - mean_all_kmers) / sd_all_kmers

maximum_expected_normalized_TTTTTT_level  <- 
  ((mean_TTTTTT - 2 * sd_TTTTTT) - mean_all_kmers) / sd_all_kmers

cat(paste0("Mean expected AAAAAA level before normalisation: ",
          abs(mean_AAAAAA), '\n'))
## Mean expected AAAAAA level before normalisation: 86.5127456992406
cat(paste0("Mean expected TTTTTT level before normalisation: ",
          abs(mean_TTTTTT), '\n'))
## Mean expected TTTTTT level before normalisation: 90.6877771232721
cat(paste0("Maximum expected normalized AAAAAA level: ", 
           abs(maximum_expected_normalized_AAAAAA_level), '\n'))
## Maximum expected normalized AAAAAA level: 0.518144121551562
cat(paste0("Maximum expected normalized TTTTTT level: ", 
           abs(maximum_expected_normalized_TTTTTT_level), '\n'))
## Maximum expected normalized TTTTTT level: 0.20178577950747

8 tailfindr tail length vs. moves in the tail region by the standard model basecaller

# get a subset of data (30 and 100 nt tails)
dna_kr_data_bc_30_100 <- dna_kr_data %>% 
  filter(barcode == 30 | barcode == 100)
cols <- c('#8c8c8c', '#e8b00b')
p <- ggplot(dna_kr_data_bc_30_100,
            aes(x = tail_length_st, y = moves_in_tail_st, colour = barcode)) +
  # geom_hline(yintercept = c(30, 100), color = cols, 
  #            linetype = 'dashed', size = 0.4) +
  # geom_vline(xintercept = c(30, 100), color = cols, 
  #            linetype = 'dashed', size = 0.4) +
  geom_point(alpha = 0.07, size = 0) +
  theme(panel.background = element_blank(),
        axis.line = element_line(color = "black", size = 0.4),
        legend.direction = "horizontal",
        legend.position = "bottom",
        rect = element_rect(fill = "transparent")) +
  xlim(0, 160) + ylim(0, 110) +
  scale_x_continuous(breaks = c(0, 30, 50, 100, 150), 
                     labels = c('0', '30', '50', '100', '150'),
                     limits = c(0, 160)) +
  scale_y_continuous(breaks = c(0, 30, 50, 100), 
                     labels = c('0', '30', '50', '100'),
                     limits = c(0, 110)) +
  scale_color_manual(values = cols) +
  xlab('tailfindr tail-length prediction (nt)') +
  ylab('Total number of moves in the tail') 

p <- ggExtra::ggMarginal(p, groupColour = TRUE, groupFill = TRUE) 

9 tailfindr tail length vs. moves in the tail region by the flipflop model basecaller

cols <- c('#8c8c8c', '#e8b00b')
p <- ggplot(dna_kr_data_bc_30_100,
            aes(x = tail_length_ff, y = moves_in_tail_ff, colour = barcode)) +
  # geom_hline(yintercept = c(30, 100), color = cols, 
  #            linetype = 'dashed', size = 0.4) +
  # geom_vline(xintercept = c(30, 100), color = cols, 
  #            linetype = 'dashed', size = 0.4) +
  geom_point(alpha = 0.07, size = 0) +
  theme(panel.background = element_blank(),
        axis.line = element_line(color = "black", size = 0.4),
        legend.direction = "horizontal",
        legend.position = "bottom",
        rect = element_rect(fill = "transparent")) +
  xlim(0, 160) + ylim(0, 110) +
  scale_x_continuous(breaks = c(0, 30, 50, 100, 150), 
                     labels = c('0', '30', '50', '100', '150'),
                     limits = c(0, 160)) +
  scale_y_continuous(breaks = c(0, 30, 50, 100), 
                     labels = c('0', '30', '50', '100'),
                     limits = c(0, 110)) +
  scale_color_manual(values = cols) +
  xlab('tailfindr tail-length prediction (nt)') +
  ylab('Total number of moves in the tail') 

p <- ggExtra::ggMarginal(p, groupColour = TRUE, groupFill = TRUE) 

10 RNA vs. DNA tail length comparison

We first load RNA data

loadd(rna_kr_data)

Because RNA and DNA dataset have different number of reads in each barcode, therefore, before comparing these two datasets, we make a new dataset in which there are equal number of reads in all barcode in both RNA and DNA conditions.

polyat_dna <- select(dna_kr_data, tail_length_st, barcode) %>% 
  mutate(barcode = as.numeric(as.character(barcode)))

polya_rna <-select(rna_kr_data, tail_length_tf, barcode) %>% 
  mutate(barcode = as.numeric(as.character(barcode)))

# tabulate reads in each DNA barcode
dna_barcode_nums <- as.data.frame(table(polyat_dna$barcode))
dna_barcode_nums <- 
  rename(dna_barcode_nums, barcode = Var1, n_desired = Freq) 
dna_barcode_nums %<>% mutate(barcode = as.numeric(as.character(barcode)))

# randomize rna data
set.seed(5)
polya_rna <- polya_rna[sample(nrow(polya_rna)),]
# inlclude only complete cases
polya_rna <- polya_rna[complete.cases(polya_rna), ]

# make a subset of RNA reads with same of reads in each barcode as that in DNA
polya_rna_subset <- left_join(polya_rna, dna_barcode_nums, by = "barcode") %>%
  group_by(barcode) %>% 
  slice(seq(first(n_desired))) %>%
  select(-n_desired)

# combine rna and dna datasets
polya_rna_subset %<>% rename(tail_length = tail_length_tf) %>% 
  mutate(data_type = 'RNA')

polyat_dna %<>%  mutate(data_type = 'DNA') %>% 
  rename(tail_length = tail_length_st)

df <- bind_rows(polyat_dna, polya_rna_subset)
df %<>% dplyr::mutate(
  data_type = as.factor(data_type),
  barcode = as.factor(barcode),
  tail_length = ifelse(tail_length > 300, 300, tail_length)
)

Now plotting the data:

p <- ggplot(data = df, aes(x = barcode, y = tail_length, 
                           color = data_type, fill = data_type)) +
  geom_two_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) +
  facet_grid(~barcode, scales = 'free') +
  geom_hline(aes(yintercept = as.numeric(as.character(barcode)))) 

11 RNA vs. DNA table

rna_kr_data_tmp <- rna_kr_data %>% 
  mutate(read_type = 'polyA',
         data = 'RNA')

rna_summary <- rna_kr_data_tmp %>% 
  group_by(replicate, barcode, read_type, kit) %>% 
  summarize(read_count_rna = n())

dna_summary <- dna_kr_data %>% 
  group_by(replicate, barcode, read_type, kit) %>% 
  summarize(read_count_dna = n())

combined_summary <- 
  full_join(dna_summary, rna_summary, by = c('replicate', 'barcode', 'read_type')) %>% 
  rename(dna_kit = kit.x, rna_kit = kit.y)

kable(combined_summary)
replicate barcode read_type dna_kit read_count_dna rna_kit read_count_rna
1 10 polyA SQK-LSK108 4884 SQK-RNA001 3148
1 10 polyT SQK-LSK108 8572 NA NA
1 30 polyA SQK-LSK108 11953 SQK-RNA001 7724
1 30 polyT SQK-LSK108 13898 NA NA
1 40 polyA SQK-LSK108 6375 SQK-RNA001 12044
1 40 polyT SQK-LSK108 11294 NA NA
1 60 polyA SQK-LSK108 7293 SQK-RNA001 11679
1 60 polyT SQK-LSK108 7733 NA NA
1 100 polyA SQK-LSK108 2619 SQK-RNA001 5439
1 100 polyT SQK-LSK108 2357 NA NA
1 150 polyA SQK-LSK108 1502 SQK-RNA001 5219
1 150 polyT SQK-LSK108 1929 NA NA
2 10 polyA SQK-LSK109 578 SQK-RNA001 40425
2 10 polyT SQK-LSK109 2500 NA NA
2 30 polyA SQK-LSK109 1110 SQK-RNA001 28557
2 30 polyT SQK-LSK109 3189 NA NA
2 40 polyA SQK-LSK109 571 SQK-RNA001 279
2 40 polyT SQK-LSK109 2517 NA NA
2 60 polyA SQK-LSK109 968 SQK-RNA001 33222
2 60 polyT SQK-LSK109 2339 NA NA
2 100 polyA SQK-LSK109 396 SQK-RNA001 21120
2 100 polyT SQK-LSK109 809 NA NA
2 150 polyA SQK-LSK109 265 SQK-RNA001 16777
2 150 polyT SQK-LSK109 606 NA NA
3 10 polyA NA NA SQK-RNA002 3463
3 30 polyA NA NA SQK-RNA002 9356
3 40 polyA NA NA SQK-RNA002 13994
3 60 polyA NA NA SQK-RNA002 14690
3 100 polyA NA NA SQK-RNA002 9831
3 150 polyA NA NA SQK-RNA002 7271